#!/usr/bin/env bash

# Do a "ps" process status on a node-list, but exclude system processes
# Usage: psnode [-c columns | -h] node-list
# Homepage: https://github.com/OleHolmNielsen/Slurm_tools/

# Default value:
export columns=100

# validate_columns VALUE
# Check that VALUE is a usable "ps --columns" width: decimal digits only and
# within the range 5..1024.
# Outputs: an explanatory message on stdout when VALUE is rejected.
# Returns: 0 when VALUE is acceptable, 1 otherwise.
validate_columns() {
	local value=$1
	if ! [[ $value =~ ^[0-9]+$ ]]
	then
		echo "Number of columns $value must be a positive number"
		return 1
	fi
	# Force base 10: without the 10# prefix bash reads a value with a leading
	# zero as octal, so "08" raises an arithmetic error (and the range check
	# is skipped), while "02000" would be compared as 1024.
	if (( 10#$value < 5 || 10#$value > 1024 ))
	then
		echo "Unreasonable number of columns $value"
		return 1
	fi
	return 0
}

# Parse command options
while getopts "c:h" options; do
	case $options in
		c )	validate_columns "$OPTARG" || exit 1
			# Store the value normalized to plain decimal (e.g. 0100 -> 100)
			export columns=$((10#$OPTARG))
			;;
		h )	# Help was requested explicitly: not an error
			echo "Usage: psnode [-c columns | -h] node-list"
			exit 0;;
		* )	# Unknown option or missing option argument
			echo "Usage: psnode [-c columns | -h] node-list"
			exit 1;;
	esac
done
shift $((OPTIND-1))


# Exactly one positional argument (the node-list) must remain at this point;
# otherwise print the usage text and terminate with exit status 255.
if (( $# != 1 ))
then
	printf '%s\n' "ERROR:" "Usage: psnode [-c columns | -h] node-list"
	exit 255
fi

# Process listing command and its flags: the output fields to print and the
# output width taken from $columns.
PS=/bin/ps
ps_fields="pid,nlwp,state,user,start,cputime,%cpu,rssize,command"
PSFLAGS="-o ${ps_fields} --columns ${columns}"
# Remote shell command: -n takes stdin from /dev/null, -x disables X11 forwarding
SSH="ssh -n -x"
# Reachability probe: a single packet (-c 1) with a 3 second deadline (-w 3)
PING="/bin/ping -c 1 -w 3"

# System users EXCLUDED from the process list
USERLIST="root rpc rpcuser daemon ntp smmsp sshd hpsmh named dbus 68 chrony polkitd munge"

# Build deselect_list: the space-separated subset of USERLIST for which
# "getent passwd" returns an entry.  The lookup runs on the host executing
# this script, not on the nodes being queried.
deselect_list=""
for candidate in $USERLIST
do
	# Skip names that getent cannot resolve (it prints nothing for them)
	[[ -n $(getent passwd "$candidate") ]] || continue
	# Prepend a single blank before every entry except the first
	deselect_list+="${deselect_list:+ }${candidate}"
done

# expand_nodelist NODELIST
# Print the hostnames produced by "scontrol show hostnames" for NODELIST.
# NODELIST is passed quoted: an expression such as node[1-3] is also a valid
# shell glob pattern, so an unquoted expansion would be replaced by matching
# file names in the current directory (e.g. a file called "node1") before
# scontrol ever sees it.
expand_nodelist() {
	scontrol show hostnames "$1"
}

sep=""
# The "scontrol show hostnames" command is used to expand NodeList expressions
for node in $(expand_nodelist "$1")
do
	# Print a separator line before every node except the first
	if test -n "$sep"
	then
		echo "$sep"
	fi
	echo Node ${node} information:
	# Print node status information:
	sinfo -N  -O "NodeList:10 ,Partition:10 ,CPUs:4 ,CPUsLoad:9 ,SocketCoreThread:8 ,Memory:7 ,StateLong:11 ,Reason:20" -n "$node"
	# The command substitution is deliberately unquoted so that the job IDs
	# printed by squeue are joined onto a single line
	echo Jobid list: $(squeue -h -O JobID -w "$node")
	echo Node ${node} user processes:
	# Only attempt the ssh when the node answers a ping
	if $PING "$node" >/dev/null 2>&1
	then
		# $SSH, $PS and $PSFLAGS are deliberately unquoted: each holds a command
		# plus options that must be split into separate words.  The escaped
		# quotes keep the user list a single argument for the remote shell.
		# Count also the number of processes (numprocs) and threads (numthreads) when field $1 is a number
		$SSH "$node" $PS $PSFLAGS --deselect -u \""${deselect_list}"\" | awk '{print $0; if($1~/^[0-9]+$/) {numprocs++; numthreads+=$2}} END {printf("Total: %d processes and %d threads\n", numprocs, numthreads)}'
	else
		echo '*** WARNING ***' Cannot ping host ${node} !
	fi
	sep="====================================="
done
